GitHub in OUP “Bioinformatics”

# Build a PubMed query string for one repository host.
# The query matches `incl_repo` in the title/abstract, excludes every entry of
# `excl_repos` in the title/abstract, and is restricted to the journal
# "Bioinformatics (Oxford, England)".
ti_ab <- '[Title/Abstract]'
build_query <- function(incl_repo, excl_repos) {
  # Fold the exclusions onto the included term, one " NOT <repo>[Title/Abstract]"
  # clause per excluded repository, in the order given.
  with_exclusions <- Reduce(
    function(query_so_far, excluded) {
      paste0(query_so_far, " NOT ", excluded, ti_ab)
    },
    excl_repos,
    paste0(incl_repo, ti_ab)
  )
  # paste()'s default separator supplies the single space before "AND".
  paste(with_exclusions, 'AND "Bioinformatics (Oxford, England)"[Journal]')
}

# Get count of results for query and year
#
# Runs `query` against the "pubmed" database through EUtilsSummary(), limited
# to the window <year>/01/01 - <year>/12/31 with datetype "ppdt", and returns
# the value of QueryCount() for that search.
#
# query: query string handed to EUtilsSummary()
# year:  year used to build the mindate / maxdate bounds
#
# Returns the hit count. If the lookup raises an error the function still
# returns 0 so a multi-year run is not aborted, but it now emits a warning:
# previously a failed request was indistinguishable from a real zero-hit year.
get_res_count <- function(query, year) {
  tryCatch({
    res <- EUtilsSummary(query = query,
                         db = "pubmed",
                         mindate = paste0(year, "/01/01"),
                         maxdate = paste0(year, "/12/31"),
                         retmax = 5000,
                         datetype = "ppdt")
    QueryCount(res)
  }, error = function(e) {
    warning("PubMed query failed for year ", year, "; counting it as 0: ",
            conditionMessage(e), call. = FALSE)
    0
  })
}

# Specific queries
# Number of abstracts in `year` that mention `incl_repo` and none of `excl_repos`
num_abs_year <- function(incl_repo, excl_repos, year) {
  repo_query <- build_query(incl_repo, excl_repos)
  get_res_count(repo_query, year)
}
# Number of results for the journal as a whole in `year`
total_bioinf_year <- function(year) {
  journal_query <- '"Bioinformatics (Oxford, England)"[Journal]'
  get_res_count(journal_query, year)
}

# Build a long table with one row per (year, repo).
# Columns: repo, year, num_abstracts (abstracts naming that repo and none of
# the other entries of `repos`) and total_articles (journal total for the year).
# Rows are ordered by year first, then by the order of `repos`.
make_count_table <- function(years, repos) {
  counts <- data.frame(repo = character(), year = integer(), num_abstracts = integer(), total_articles = integer())
  for (yr in years) {
    # The journal total is fetched once per year and shared by that year's rows
    n_total <- total_bioinf_year(yr)
    rows_for_year <- lapply(repos, function(this_repo) {
      other_repos <- setdiff(repos, this_repo)
      data.frame(repo = this_repo,
                 year = yr,
                 num_abstracts = num_abs_year(this_repo, other_repos, yr),
                 total_articles = n_total)
    })
    counts <- do.call(rbind, c(list(counts), rows_for_year))
  }
  counts
}

# Per-repository abstract counts for every year in 2009-2017
count_table <- make_count_table(2009:2017, c("GitHub", "Bitbucket", "SourceForge"))

# Append one extra row per year: the journal's yearly total minus the summed
# per-repository counts. Intermediates live inside local() so that only
# `count_table` is (re)assigned at top level.
count_table <- local({
  yearly_totals <- unique(select(count_table, year, total_articles))
  none_rows <- count_table %>%
    group_by(year) %>%
    mutate(other = total_articles - sum(num_abstracts)) %>%
    select(year, other) %>%
    unique() %>%
    mutate(repo = "Abstracts with none of these") %>%
    rename(num_abstracts = other) %>%
    select(repo, year, num_abstracts) %>%
    left_join(yearly_totals, by = "year")
  rbind(count_table, data.frame(none_rows))
})

# Fix the level order of `repo`
count_table$repo <- factor(
  count_table$repo,
  levels = c("Abstracts with none of these", "Bitbucket", "SourceForge", "GitHub")
)

# Bar chart of abstract counts per publication year, filled by `repo`.
# `year` is converted to character for the x aesthetic; bar heights come
# straight from `num_abstracts` (stat = "identity").
ggplot() + 
  geom_bar(data = count_table %>% 
             mutate(year = as.character(year)), 
           aes(y = num_abstracts, x = year, fill = repo), 
           stat = "identity") +
  xlab("Year of publication in Bioinformatics (Oxford University Press)") +
  ylab("Number of abstracts containing repository name") +
  guides(fill = guide_legend(title = "Repository")) +
  theme_bw() +
  theme(axis.title.x = element_text(size = 16),
        axis.title.y = element_text(size = 16),
        legend.title = element_text(size = 16),
        legend.text = element_text(size = 14),
        axis.text.x = element_text(size = 14),
        axis.text.y = element_text(size = 14)) +
  # Four fill colours, one per `repo` level
  scale_fill_manual(values=c("gray", "#4DAF4A", "#377EB8", "#E41A1C"))

# Save the chart as PNG and PDF (no plot argument is passed to ggsave)
ggsave("abs_bioinf_repo.png")
## Saving 10 x 8 in image
ggsave("abs_bioinf_repo.pdf")
## Saving 10 x 8 in image
## Saving 10 x 8 in image

Topic modeling

# Repository metrics plus every column whose name contains "topic", with the
# plotted metrics given display names. Column order follows the selection.
plt_data_topics <- select(
  as.tbl(repo_data_main),
  Commits = commits,
  `Commit authors` = commit_authors,
  Forks = forks_count,
  Subscribers = subscribers_count,
  Stargazers = stargazers_count,
  `Mean PMC citations / week` = num_citations_per_week_pmc_minus_2_years,
  total_file_size_no_data,
  `Total files` = num_files_no_data,
  contains("topic")
)

# Change NA's
#
# Replace every missing value in `x` by the constant `c`.
#   x: a scalar or a vector
#   c: replacement value
# Returns `x` with NA entries replaced. When `x` holds no NA it is returned
# untouched (same type), which matches the previous scalar-only behaviour; the
# previous `if (is.na(x))` form could not handle inputs longer than one.
change_na <- function(x, c) {
  if (anyNA(x)) {
    x[is.na(x)] <- c
  }
  x
}
# Convenience wrappers: NA -> 0 and NA -> 1
na_to_zero <- function(x) change_na(x, 0)
na_to_one <- function(x) change_na(x, 1)
# Missing commit counts, author counts and citation rates become 0;
# a missing code size becomes 1.
plt_data_topics$Commits <- sapply(plt_data_topics$Commits, na_to_zero)
plt_data_topics$`Commit authors` <- sapply(plt_data_topics$`Commit authors`, na_to_zero)
plt_data_topics$`Mean PMC citations / week` <- sapply(plt_data_topics$`Mean PMC citations / week`, na_to_zero)
plt_data_topics$total_file_size_no_data <- sapply(plt_data_topics$total_file_size_no_data, na_to_one)

# Derived columns: code size in megabytes and "+ 1" versions of the counts
plt_data_topics <- dplyr::mutate(
  plt_data_topics,
  `Megabytes of code` = total_file_size_no_data / 1000000,
  `Forks + 1` = Forks + 1,
  `Subscribers + 1` = Subscribers + 1,
  `Stargazers + 1` = Stargazers + 1,
  `1 + mean PMC citations / week` = `Mean PMC citations / week` + 1
)
# Drop the source columns that the derived ones replace
plt_data_topics <- select(
  plt_data_topics,
  -total_file_size_no_data, -Forks, -Subscribers, -Stargazers, -`Mean PMC citations / week`
)

# First melt: one row per (repo, topic column); keep rows where the topic
# flag is true, then discard the flag itself.
plt_data_topics <- melt(
  plt_data_topics,
  id.vars = c("Commits", "Commit authors", "Forks + 1",
              "Subscribers + 1", "Stargazers + 1", "1 + mean PMC citations / week",
              "Megabytes of code", "Total files")
)
plt_data_topics <- filter(plt_data_topics, value)
plt_data_topics <- as.tbl(plt_data_topics)
plt_data_topics <- select(plt_data_topics, -value)
plt_data_topics <- dplyr::rename(plt_data_topics, Topic = variable)

# Second melt: one row per (topic, metric) pair
plt_data_topics <- melt(plt_data_topics, id.vars = "Topic")

# Add number of repos per topic
# topic_ct maps each topic column name to the sum of that column in
# repo_data_main.
topic_cols <- as.character(unique(plt_data_topics$Topic))
topic_ct <- unlist(map(topic_cols, function(x) sum(repo_data_main[[x]])))
names(topic_ct) <- topic_cols
# Look the counts up by topic *name*. `Topic` comes out of melt() as a factor,
# and indexing with a factor element uses its integer code, which only matches
# the intended entry when level order and order of appearance coincide.
topic_names <- as.character(plt_data_topics$Topic)
topic_with_ct <- sprintf("%s (N=%s)", topic_names, topic_ct[topic_names])
plt_data_topics$Topic <- topic_with_ct

# Turn topic column names into display labels: strip the "topic_" prefix,
# replace underscores by spaces, then restore the hyphen in "RNA-seq".
plt_data_topics$Topic <- gsub(
  "RNA.seq", "RNA-seq",
  gsub("_", " ", gsub("topic_", "", plt_data_topics$Topic))
)

# Make the plot
# Notched boxplots of each metric, one facet per metric (free scales, 3
# columns), filled by topic, on a log10 y axis. The legend is placed inside
# the panel area at relative position (0.835, 0.17).
ggplot(plt_data_topics,
       aes(x = variable, y = value, fill = factor(Topic))) +
  # TRUE rather than T: T is an ordinary variable that can be reassigned
  geom_boxplot(notch = TRUE) +
  theme_bw() +
  guides(fill = guide_legend(title="Abstract includes topic")) +
  theme(legend.text = element_text(size=13),
        legend.title = element_text(size = 15),
        axis.text.y = element_text(size = 14),
        axis.text.x = element_blank(),
        axis.title = element_blank(),
        strip.text = element_text(size = 14),
        legend.position = c(0.835, 0.17)) +
  scale_fill_brewer(palette="Dark2") +
  facet_wrap(~variable, scales = "free", ncol = 3) +
  scale_y_log10(breaks = c(0.1, 1, 10, 100, 1000, 10000), labels = comma)
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Removed 146 rows containing non-finite values (stat_boxplot).
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.

# Save the topic boxplots as PNG (no plot argument is passed to ggsave)
ggsave("topics.png")
## Saving 12 x 8 in image
## Warning: Transformation introduced infinite values in continuous y-axis

## Warning: Removed 146 rows containing non-finite values (stat_boxplot).
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
# Save the topic boxplots as PDF (no plot argument is passed to ggsave)
ggsave("topics.pdf")
## Saving 12 x 8 in image
## Warning: Transformation introduced infinite values in continuous y-axis

## Warning: Removed 146 rows containing non-finite values (stat_boxplot).
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.

Commits after publication

# Long table of repository metrics, keyed by whether the repo has commits
# after the article appeared in PubMed. Repos where that flag is NA are dropped.
plt_data_commits_after_pub <- as.tbl(repo_data_main) %>%
  select(
    `Total commits` = commits,
    `Commit authors` = commit_authors,
    `Total forks` = forks_count,
    `Total subscribers` = subscribers_count,
    `Total stargazers` = stargazers_count,
    `PMC citations / week` = num_citations_per_week_pmc_minus_2_years,
    `Commit message length` = mean_commit_message_len,
    `Pct outside commits` = pct_commits_diff_author_committer,
    `Outside commit authors` = num_non_committing_authors,
    `Commits after\npublication` = commits_after_article_in_pubmed
  ) %>%
  filter(!is.na(`Commits after\npublication`)) %>%
  melt(id.vars = "Commits after\npublication")

# Smallest strictly positive value of each variable, used later as a floor so
# that logs can be taken
min_pos <- dplyr::summarize(
  group_by(filter(plt_data_commits_after_pub, value > 0), variable),
  min_pos = min(value)
)

# Per-variable upper cutoff for plotting: the p_outlier quantile of the values
p_outlier <- 1 # 1 means no filtering for outliers
outlier_cutoff <- dplyr::summarize(
  group_by(plt_data_commits_after_pub, variable),
  outlier_cutoff = quantile(value, probs = p_outlier, na.rm = TRUE)
)

# Attach both per-variable summaries to every row
plt_data_commits_after_pub <- left_join(
  left_join(plt_data_commits_after_pub, min_pos, by = "variable"),
  outlier_cutoff,
  by = "variable"
)

# Replace 0's and NA's by minimum positive value
# Computed column-wise. The previous row-wise apply() converted the whole data
# frame to a character matrix first (numbers pass through format()), so the
# values read back with as.numeric() could be rounded.
plt_data_commits_after_pub$value_pos <- with(
  plt_data_commits_after_pub,
  ifelse(is.na(value), min_pos, pmax(value, min_pos))
)

# Replace NA's by minimum positive value
plt_data_commits_after_pub$value_non_na <- with(
  plt_data_commits_after_pub,
  ifelse(is.na(value), min_pos, value)
)

# Change true/false to yes/no
# Each label carries the number of rows of repo_data_main in that group.
label_yes <- paste0("Yes (N=", nrow(filter(repo_data_main, commits_after_article_in_pubmed)), ")")
label_no <- paste0("No (N=", nrow(filter(repo_data_main, !commits_after_article_in_pubmed)), ")")
plt_data_commits_after_pub[["Commits after\npublication"]] <- unlist(lapply(
  plt_data_commits_after_pub[["Commits after\npublication"]],
  function(has_commits) {
    if (has_commits) {
      label_yes
    } else {
      label_no
    }
  }
))

# Keep rows at or below the per-variable cutoff and only the plotted columns
plt_data_commits_after_pub <- select(
  filter(plt_data_commits_after_pub, value_non_na <= outlier_cutoff),
  `Commits after\npublication`, variable, value_non_na, value_pos
)

# T test for each variable
# Unpaired two-sided t-test of value_non_na between the yes/no groups. x and y
# are the coordinates at which the p-value label is drawn in each facet.
t_test_commits_after_pub <- dplyr::summarize(
  group_by(plt_data_commits_after_pub, variable),
  p = t.test(value_non_na ~ `Commits after\npublication`,
             paired = FALSE,
             alternative = "two.sided")$p.value,
  x = 0.5,
  y = max(value_non_na)
)

# Boxplots of value_pos per metric, filled by the yes/no group, one facet per
# metric (free scales, 3 columns) on a log10 y axis. Each facet also gets a
# text label with the t-test p-value taken from t_test_commits_after_pub.
ggplot(plt_data_commits_after_pub, aes(variable, value_pos)) +
  geom_boxplot(aes(fill = `Commits after\npublication`)) +
  facet_wrap(~variable, scales = "free", ncol = 3) +
  scale_y_log10(labels = comma) +
  theme_bw() +
  theme(legend.text = element_text(size=14),
        legend.title = element_text(size = 16),
        axis.text.y = element_text(size = 14),
        axis.text.x = element_blank(),
        axis.title = element_blank(),
        strip.text = element_text(size = 14)) +
  # p-value label, formatted in scientific notation with one digit
  geom_label(data = t_test_commits_after_pub,
             aes(x = x, y = y, label = paste("p =", formatC(p, format = "e", digits = 1))),
             fill = "white",
             size = 5.5,
             label.size = 0,
             color = "steelblue",
             hjust = 0) +
  scale_fill_brewer(palette = "Set2")

# Save the figure as PNG and PDF (no plot argument is passed to ggsave)
ggsave("commits_after_pub.png")
## Saving 11 x 8 in image
ggsave("commits_after_pub.pdf")
## Saving 11 x 8 in image
# Check if relationship between commit message length and commits after publication can be explained by team size
# Pearson correlation between commit_authors and mean_commit_message_len:
# first the estimate, then the p-value.
with(repo_data_main, cor.test(commit_authors, mean_commit_message_len, method = "pearson"))$estimate
##       cor 
## 0.1442577
with(repo_data_main, cor.test(commit_authors, mean_commit_message_len, method = "pearson"))$p.value
## [1] 1.896942e-09
# Make another version of the figure that is wide without outside commit authors
# Same layers as the figure above, but the "Outside commit authors" variable is
# filtered out of both the data and the p-value labels, facets are laid out in
# 4 columns, and text sizes are smaller. The plot is stored in `plt`.
plt <- ggplot(plt_data_commits_after_pub %>% filter(variable != "Outside commit authors"), aes(variable, value_pos)) +
  geom_boxplot(aes(fill = `Commits after\npublication`)) +
  facet_wrap(~variable, scales = "free", ncol = 4) +
  scale_y_log10() +
  theme_bw() +
  theme(legend.text = element_text(size=10),
        legend.title = element_text(size = 11),
        axis.text.y = element_text(size = 10),
        axis.text.x = element_blank(),
        axis.title = element_blank(),
        strip.text = element_text(size = 10)) +
  geom_label(data = t_test_commits_after_pub %>% filter(variable != "Outside commit authors"),
             aes(x = x, y = y, label = paste("p =", formatC(p, format = "e", digits = 1))),
             fill = "white",
             label.size = 0,
             color = "steelblue",
             hjust = 0) +
  scale_fill_brewer(palette = "Set2")

# Display the stored plot
plt

Outside contributors

# Community metrics (each + 1) against the number of outside commit authors
# (+ 1), melted to one row per (repo, community metric).
plt_data_outside_contrib <- as.tbl(repo_data_all) %>%
  transmute(
    `Total forks + 1` = forks_count + 1,
    `Total subscribers + 1` = subscribers_count + 1,
    `Total stargazers + 1` = stargazers_count + 1,
    `Outside commit authors + 1` = num_non_committing_authors + 1,
    is_high_profile
  ) %>%
  melt(id.vars = c("Outside commit authors + 1", "is_high_profile"))

# Correlation
# Pearson estimate and p-value per community metric. x and y are the
# coordinates at which the label is drawn in each facet.
corr_outside_contrib <- dplyr::summarize(
  group_by(plt_data_outside_contrib, variable),
  corr_pearson = cor.test(value, `Outside commit authors + 1`, method = "pearson")$estimate,
  pval_pearson = cor.test(value, `Outside commit authors + 1`, method = "pearson")$p.value,
  x = 1,
  y = 10^(0.93 * log10(max(value)))
)

# Collapse identical records
# `Num repos` counts how many rows share the same coordinates and dataset flag.
plt_data_outside_contrib <- dplyr::summarize(
  group_by(plt_data_outside_contrib,
           `Outside commit authors + 1`, is_high_profile, variable, value),
  `Num repos` = n()
)

# Make the plot
# Log-log scatter of each community metric against outside commit authors,
# one facet per metric; point size is the number of repos at that position and
# colour is the dataset (main vs. high profile).
ggplot(plt_data_outside_contrib) +
  geom_point(aes(size = `Num repos`, 
                 x = `Outside commit authors + 1`, 
                 y = value, col = is_high_profile)) +
  # Second layer repeats only the high-profile rows; as a later layer it is
  # drawn after the first one (presumably so these points are not hidden)
  geom_point(data = subset(plt_data_outside_contrib, is_high_profile),
             aes(size = `Num repos`, 
                 x = `Outside commit authors + 1`, 
                 y = value, col = is_high_profile)) +
  # color_main / color_high_prof are defined outside this section
  scale_color_manual(values=c(color_main, color_high_prof), labels = c("Main repos", "High profile repos")) +
  theme_bw() +
  guides(color = guide_legend(title = "Dataset")) +
  theme(legend.text = element_text(size=14),
        legend.title = element_text(size = 16),
        axis.text = element_text(size = 14),
        strip.text = element_text(size = 14),
        axis.title.y = element_blank()) +
  facet_wrap(~variable, scales = "free", ncol = 3) +
  scale_y_log10(breaks = c(1, 10, 100, 1000, 10000), labels = comma) +
  scale_x_log10() +
  # Per-facet label with the Pearson r (2 significant digits) and p-value
  geom_label(data = corr_outside_contrib,
             aes(x = x, y = y, label = paste("r = ", signif(corr_pearson, digits = 2), "\np = ", formatC(pval_pearson, format = "e", digits = 1), sep = "")),
             fill = "white",
             label.size = 0,
             color = "mediumpurple4",
             hjust = 0)

# Save the figure as PNG and PDF (no plot argument is passed to ggsave)
ggsave("outside_contrib.png")
## Saving 10 x 3 in image
ggsave("outside_contrib.pdf")
## Saving 10 x 3 in image
# How many repos have outside contributors?
# Fraction of rows with num_non_committing_authors > 0, first for the main
# dataset and then for the high-profile dataset.
nrow(filter(repo_data_main, num_non_committing_authors > 0)) / nrow(repo_data_main)
## [1] 0.1412791
nrow(filter(repo_data_high_prof, num_non_committing_authors > 0)) / nrow(repo_data_high_prof)
## [1] 0.6956522

Commit authors and community

# Community metrics (each + 1) against the number of commit authors, melted to
# one row per (repo, community metric).
plt_data_devs_community <- as.tbl(repo_data_all) %>%
  transmute(
    `Total forks + 1` = forks_count + 1,
    `Total subscribers + 1` = subscribers_count + 1,
    `Total stargazers + 1` = stargazers_count + 1,
    `Commit authors` = commit_authors,
    `High profile` = is_high_profile
  ) %>%
  melt(id.vars = c("Commit authors", "High profile"))

# Correlation
# Pearson estimate and p-value per community metric. x and y are the
# coordinates at which the label is drawn in each facet.
corr_devs_community <- dplyr::summarize(
  group_by(plt_data_devs_community, variable),
  corr_pearson = cor.test(value, `Commit authors`, method = "pearson")$estimate,
  pval_pearson = cor.test(value, `Commit authors`, method = "pearson")$p.value,
  x = 1,
  y = 10^(0.93 * log10(max(value)))
)

# Collapse identical records
# `Num repos` counts how many rows share the same coordinates and dataset flag.
plt_data_devs_community <- dplyr::summarize(
  group_by(plt_data_devs_community,
           variable, `Commit authors`, `High profile`, value),
  `Num repos` = n()
)

# Log-log scatter of each community metric against the number of commit
# authors, one facet per metric; point size is the number of repos at that
# position and colour is the dataset (main vs. high profile).
ggplot(plt_data_devs_community) +
  geom_point(aes(x = `Commit authors`, 
                 y = value,
                 col = `High profile`,
                 size = `Num repos`)) +
  # Second layer repeats only the high-profile rows; as a later layer it is
  # drawn after the first one (presumably so these points are not hidden)
  geom_point(data = subset(plt_data_devs_community, `High profile`),
             aes(x = `Commit authors`, 
                 y = value,
                 col = `High profile`,
                 size = `Num repos`)) +
  # color_main / color_high_prof are defined outside this section
  scale_color_manual(values=c(color_main, color_high_prof), labels = c("Main repos", "High profile repos")) +
  theme_bw() +
  guides(color = guide_legend(title = "Dataset")) +
  theme(legend.text = element_text(size=14),
        legend.title = element_text(size = 16),
        axis.text = element_text(size = 14),
        axis.title.y = element_blank(),
        axis.title.x = element_text(size = 16),
        strip.text = element_text(size = 14)) +
  facet_wrap(~variable, scales = "free", ncol = 3) +
  scale_y_log10(breaks = c(1, 10, 100, 1000), labels = comma) +
  scale_x_log10() +
  # Per-facet label with the Pearson r (2 significant digits) and p-value
  geom_label(data = corr_devs_community,
             aes(x = x, y = y, label = paste("r = ", signif(corr_pearson, digits = 2), "\np = ", formatC(pval_pearson, format = "e", digits = 1), sep = "")),
             fill = "white",
             label.size = 0,
             color = "mediumpurple4",
             hjust = 0)
## Warning: Removed 4 rows containing missing values (geom_point).

# Save the figure as PNG and PDF (no plot argument is passed to ggsave)
ggsave("commit_authors.png")
## Saving 11 x 4 in image
## Warning: Removed 4 rows containing missing values (geom_point).
ggsave("commit_authors.pdf")
## Saving 11 x 4 in image
## Warning: Removed 4 rows containing missing values (geom_point).

Commits

# Commit-activity metrics against total commits, melted to one row per
# (repo, metric). The "months without commits" metric is shifted by + 1.
plt_data_commits <- as.tbl(repo_data_all) %>%
  select(
    `Total commits` = commits,
    `Mean commits/month` = mean_commits_per_month,
    `Max cons. months with commits` = consecutive_months_with_commits,
    `Project duration (days)` = commit_span_days,
    `Mean new files per month` = mean_files_added_per_month,
    `Days with new files added` = num_days_new_files_added,
    `1 + max cons. months no commits` = consecutive_months_no_commits,
    `High profile` = is_high_profile
  ) %>%
  mutate(`1 + max cons. months no commits` = `1 + max cons. months no commits` + 1) %>%
  melt(id.vars = c("Total commits", "High profile"))

# Correlation
# Pearson estimate and p-value per metric. x and y are the coordinates at
# which the label is drawn in each facet (missing values ignored for the max).
corr_commits <- dplyr::summarize(
  group_by(plt_data_commits, variable),
  corr_pearson = cor.test(value, `Total commits`, method = "pearson")$estimate,
  pval_pearson = cor.test(value, `Total commits`, method = "pearson")$p.value,
  x = 1,
  y = 10^(0.93 * log10(max(value, na.rm = TRUE)))
)

# Collapse identical records
# `Num repos` counts how many rows share the same coordinates and dataset flag.
plt_data_commits <- dplyr::summarize(
  group_by(plt_data_commits, `Total commits`, variable, value, `High profile`),
  `Num repos` = n()
)


# Log-log scatter of each commit-activity metric against total commits, one
# facet per metric; point size is the number of repos at that position and
# colour is the dataset (main vs. high profile).
ggplot(plt_data_commits) +
  geom_point(aes(x = `Total commits`, 
                 y = value,
                 col = `High profile`,
                 size = `Num repos`)) +
  # Second layer repeats only the high-profile rows; as a later layer it is
  # drawn after the first one (presumably so these points are not hidden)
  geom_point(data = subset(plt_data_commits, `High profile`), 
             aes(x = `Total commits`, 
                 y = value,
                 col = `High profile`,
                 size = `Num repos`)) +
  # color_main / color_high_prof are defined outside this section
  scale_color_manual(values=c(color_main, color_high_prof), labels = c("Main repos", "High profile repos")) +
  theme_bw() +
  guides(color = guide_legend(title = "Dataset")) +
  theme(legend.text = element_text(size=14),
        legend.title = element_text(size = 16),
        axis.text = element_text(size = 14),
        axis.title.y = element_blank(),
        axis.title.x = element_text(size = 16),
        strip.text = element_text(size = 14)) +
  facet_wrap(~variable, scales = "free", ncol = 3) +
  scale_y_log10(breaks = c(1, 10, 100, 1000), labels = comma) +
  scale_x_log10(breaks = c(1, 10, 100, 1000, 10000), labels = comma) +
  # Per-facet label with the Pearson r (2 significant digits) and p-value
  geom_label(data = corr_commits,
             aes(x = x, y = y, label = paste("r = ", signif(corr_pearson, digits = 2), "\np = ", formatC(pval_pearson, format = "e", digits = 1), sep = "")),
             fill = "white",
             label.size = 0,
             color = "mediumpurple4",
             hjust = 0)
## Warning: Removed 20 rows containing missing values (geom_point).

# Save the figure as PNG and PDF (no plot argument is passed to ggsave)
ggsave("commits.png")
## Saving 13 x 6 in image
## Warning: Removed 20 rows containing missing values (geom_point).
ggsave("commits.pdf")
## Saving 13 x 6 in image
## Warning: Removed 20 rows containing missing values (geom_point).

Languages: file sizes and lines of code

# Header-style language names; they are the suffixes of the per-language
# columns of repo_data_all (num_files_<lang>, mean_lines_code_<lang>).
top_langs_as_header <- sapply(top_langs, format_lang_as_header)
lang_cols <- unname(c(sapply(top_langs_as_header, function(x) paste("num_files_", x, sep = "")),
                      sapply(top_langs_as_header, function(x) paste("mean_lines_code_", x, sep = ""))))

# Empty template with the final column names. check.names = FALSE keeps the
# names containing spaces intact; with the default they would be rewritten
# (e.g. to "Number.of.files") and no longer match the rows appended below.
plt_data_langs <- data.frame(`Number of files` = integer(),
                             `Mean lines of code per file` = numeric(),
                             is_high_profile = logical(),
                             lang = character(),
                             check.names = FALSE)

# Append, for every language, the repos that have at least one file and a
# positive mean line count in that language.
for (lang in top_langs_as_header) {
  col_nf <- paste0("num_files_", lang)
  col_loc <- paste0("mean_lines_code_", lang)
  plt_data_langs <- rbind(plt_data_langs,
                          repo_data_all %>%
                            select(!!as.name(col_nf), !!as.name(col_loc), is_high_profile) %>%
                            mutate(lang = lang) %>%
                            rename(`Number of files` = !!as.name(col_nf),
                                   `Mean lines of code per file` = !!as.name(col_loc)) %>%
                            filter(`Number of files` > 0 & `Mean lines of code per file` > 0))
}

# Log-log scatter of mean lines of code per file against number of files, one
# facet per language (shared scales, 5 columns), coloured by dataset.
ggplot(plt_data_langs) +
  geom_point(aes(x = `Number of files`, 
                 y = `Mean lines of code per file`,
                 col = is_high_profile)) +
  # Second layer repeats only the high-profile rows; as a later layer it is
  # drawn after the first one (presumably so these points are not hidden)
  geom_point(data = subset(plt_data_langs, is_high_profile), 
             aes(x = `Number of files`, 
                 y = `Mean lines of code per file`,
                 col = is_high_profile)) +
  # color_main / color_high_prof are defined outside this section
  scale_color_manual(values=c(color_main, color_high_prof), labels = c("Main repos", "High profile repos")) +
  theme_bw() +
  guides(color = guide_legend(title = "Dataset")) +
  theme(legend.text = element_text(size=14),
        legend.title = element_text(size = 16),
        axis.text = element_text(size = 14),
        strip.text = element_text(size = 14),
        axis.title = element_text(size = 16)) +
  facet_wrap(~lang, scales = "fixed", ncol = 5) +
  # Same breaks on both axes, labelled with "K" for thousands
  scale_y_log10(breaks = c(1, 10, 100, 1000, 10000), labels = c("1", "10", "100", "1K", "10K")) +
  scale_x_log10(breaks = c(1, 10, 100, 1000, 10000), labels = c("1", "10", "100", "1K", "10K"))

# Save the figure as PNG and PDF (no plot argument is passed to ggsave)
ggsave("languages.png")
## Saving 13 x 8 in image
ggsave("languages.pdf")
## Saving 13 x 8 in image

Bytes of code by language

# Names of the per-language byte-count columns of repo_data_all
lang_cols <- unname(c(sapply(top_langs_as_header, function(x) paste("bytes_", x, sep = ""))))

# Long table with one row per (repo, language): bytes of code in that
# language, the dataset flag and the language name.
plt_data_lang_bytes <- data.frame(bytes = integer(),
                                  is_high_profile = logical(),
                                  lang = character())
for (lang in top_langs_as_header) {
  col_b <- paste0("bytes_", lang)
  plt_data_lang_bytes <- rbind(plt_data_lang_bytes,
                               repo_data_all %>%
                                 select(!!as.name(col_b), is_high_profile) %>%
                                 mutate(lang = lang) %>%
                                 rename(bytes = !!as.name(col_b)))
}

# Human-readable dataset label. Vectorized: the previous element-wise
# sapply() with a scalar `if` stopped with an error on an NA flag and returned
# list() for an empty table. An NA flag now yields an NA label.
plt_data_lang_bytes$Dataset <- ifelse(plt_data_lang_bytes$is_high_profile,
                                      "High profile repos",
                                      "Main repos")

# Bar chart of code size per language in megabytes (bytes / 1e6), one facet
# per dataset with free scales. The legend is hidden.
ggplot(plt_data_lang_bytes, 
       aes(x = lang, y = bytes / 1000000, fill = Dataset)) +
  geom_bar(stat = "identity") +
  # color_high_prof / color_main are defined outside this section
  scale_fill_manual(values=c(color_high_prof, color_main)) +
  theme_bw() +
  theme(axis.text = element_text(size = 14),
        strip.text = element_text(size = 14),
        axis.text.x = element_text(angle = 45, hjust = 1),
        legend.position = "none",
        axis.title = element_text(size = 16),
        plot.margin = margin(10, 10, 10, 70)) +
  xlab("Language") +
  ylab("Total megabytes of code") +
  facet_wrap(~Dataset, scales = "free")

# Save the figure as PDF and PNG (no plot argument is passed to ggsave)
ggsave("bytes_by_lang.pdf")
## Saving 12 x 7 in image
ggsave("bytes_by_lang.png")
## Saving 12 x 7 in image

Years by topic

# Year of first commit and year of PubMed entry for each repo, next to its
# topic columns
plt_data_years_by_topic <- select(repo_data_main, first_commit, date_pubmed, contains("topic"))
plt_data_years_by_topic <- mutate(plt_data_years_by_topic,
                                  year_first_commit = year(first_commit),
                                  year_pubmed = year(date_pubmed))
plt_data_years_by_topic <- select(plt_data_years_by_topic, -first_commit, -date_pubmed)

# One row per (repo, topic) where the topic flag is true, then the number of
# repos per (first-commit year, PubMed year, topic); rows missing either year
# are dropped afterwards.
plt_data_years_by_topic <- melt(plt_data_years_by_topic,
                                id.vars = c("year_first_commit", "year_pubmed"))
plt_data_years_by_topic <- filter(plt_data_years_by_topic, value)
plt_data_years_by_topic <- dplyr::summarize(
  group_by(plt_data_years_by_topic, year_first_commit, year_pubmed, variable),
  `Num repos` = n()
)
plt_data_years_by_topic <- filter(plt_data_years_by_topic,
                                  !is.na(year_first_commit) & !is.na(year_pubmed))
plt_data_years_by_topic <- rename(plt_data_years_by_topic, Topic = variable)

# Display labels: restore the hyphen in "RNA-seq", strip the "topic_" prefix,
# then replace underscores by spaces
plt_data_years_by_topic$Topic <- sapply(plt_data_years_by_topic$Topic, function(x) {
  label <- gsub("RNA.seq", "RNA-seq", x)
  label <- gsub("topic_", "", label)
  gsub("_", " ", label)
})

# Common axis limits covering both year columns
min_year <- min(plt_data_years_by_topic$year_first_commit, plt_data_years_by_topic$year_pubmed)
max_year <- max(plt_data_years_by_topic$year_first_commit, plt_data_years_by_topic$year_pubmed)
# Scatter of PubMed year against year of first commit, one facet per topic
# (shared scales, 2 columns); point size is the number of repos. Both axes use
# the same limits (min_year, max_year).
ggplot(plt_data_years_by_topic) +
  geom_point(aes(x = year_first_commit, 
                 y = year_pubmed,
                 size = `Num repos`)) +
  theme_bw() +
  theme(legend.text = element_text(size=14),
        legend.title = element_text(size = 16),
        axis.text = element_text(size = 14),
        strip.text = element_text(size = 14),
        axis.title = element_text(size = 16)) +
  facet_wrap(~Topic, scales = "fixed", ncol = 2) +
  xlim(min_year, max_year) +
  ylim(min_year, max_year) +
  xlab("Year of initial commit") +
  ylab("Year paper in PubMed")

# Save the figure as PDF and PNG (no plot argument is passed to ggsave)
ggsave("topics_by_year.pdf")
## Saving 10 x 11 in image
ggsave("topics_by_year.png")
## Saving 10 x 11 in image

Licenses

# License of every repo together with its dataset flag
plt_data_licenses <- repo_data_all %>%
  select(license, is_high_profile)
# Turn the logical flag into a factor and relabel its two levels
# (FALSE -> "Main repos", TRUE -> "High profile repos")
plt_data_licenses$is_high_profile <- as.factor(plt_data_licenses$is_high_profile)
levels(plt_data_licenses$is_high_profile) <- c("Main repos", "High profile repos")
# Number of repos per license, one facet per dataset. geom_bar() counts rows
# per x value by default; the earlier geom_histogram(stat = "count") drew the
# same bars but warned about ignored histogram parameters.
ggplot(plt_data_licenses, aes(x = license)) + geom_bar() +
  facet_wrap(~is_high_profile, scales = "free") +
  theme_bw() +
  theme(axis.text.y = element_text(size = 14),
        axis.text.x = element_text(angle = 45, hjust = 1, size = 14),
        strip.text = element_text(size = 14),
        axis.title = element_text(size = 16)) +
  ylab("Number of repos") +
  xlab("License")

# Save the figure as PDF and PNG (no plot argument is passed to ggsave)
ggsave("licenses.pdf")
## Saving 10 x 5 in image
ggsave("licenses.png")
## Saving 10 x 5 in image

Language features

# Execution-method table, with language names lower-cased for joining
exec_method <- mutate(
  list_tabledata(project = proj_main,
                 dataset = ds_lang,
                 table = table_exec_method),
  language = tolower(language)
)

# Type-system table, with language names lower-cased for joining
type_system <- mutate(
  list_tabledata(project = proj_main,
                 dataset = ds_lang,
                 table = table_type_system),
  language = tolower(language)
)

# One row per top language (lower-case name plus header-style name), with the
# execution-method and type-system columns attached by language
lang_features <- left_join(
  left_join(
    data.frame(language = tolower(top_langs),
               lang_header = top_langs_as_header),
    exec_method,
    by = "language"
  ),
  type_system,
  by = "language"
)
## Warning: Column `language` joining factor and character vector, coercing
## into character vector
# For every entry of lang_features$lang_header, sum the repo_data_all column
# named <prefix><header> over the rows selected by `keep_rows`.
#   prefix:    column-name prefix, e.g. "bytes_"
#   keep_rows: logical vector over the rows of repo_data_all
# Reads the globals lang_features and repo_data_all.
sum_lang <- function(prefix, keep_rows) {
  row_idx <- which(keep_rows)
  sapply(lang_features$lang_header, function(header) {
    col_name <- paste0(prefix, header)
    sum(repo_data_all[row_idx, col_name])
  })
}

# Total bytes and file counts per language, separately for the high-profile
# rows and for the remaining rows of repo_data_all
lang_features$bytes_high_profile <- sum_lang("bytes_", repo_data_all$is_high_profile)
lang_features$bytes_main <- sum_lang("bytes_", !repo_data_all$is_high_profile)
lang_features$files_high_profile <- sum_lang("num_files_", repo_data_all$is_high_profile)
lang_features$files_main <- sum_lang("num_files_", !repo_data_all$is_high_profile)

# Derive one execution-mode label and one type-system label per language.
# seq_len() keeps the loop from running when lang_features has no rows
# (1:0 would iterate over 1 and 0).
lang_features$exec <- ""
lang_features$type <- ""
for (i in seq_len(nrow(lang_features))) {

  # Execution method
  interpreted <- isTRUE(lang_features[i, "interpreted"])
  compiled <- isTRUE(lang_features[i, "compiled"])
  if (interpreted && compiled) {
    lang_features[i, "exec"] <- "Both"
  } else if (interpreted) {
    lang_features[i, "exec"] <- "Interpreted"
  } else if (compiled) {
    lang_features[i, "exec"] <- "Compiled"
  } else {
    lang_features[i, "exec"] <- NA
  }

  # Type system
  # Join the available parts with spaces, capitalizing `strength`. Building
  # the label from the non-missing parts only avoids a literal "NA" prefix,
  # which the earlier paste(type, ...) produced when `strength` was missing
  # but `system` or `safety` was present. NA when all three are missing.
  strength <- lang_features[i, "strength"]
  system <- lang_features[i, "system"]
  safety <- lang_features[i, "safety"]
  type_parts <- c(
    if (!is.na(strength)) as.character(capitalize(strength)),
    if (!is.na(system)) as.character(system),
    if (!is.na(safety)) as.character(safety)
  )
  type <- if (length(type_parts) > 0) paste(type_parts, collapse = " ") else NA
  lang_features[i, "type"] <- type
}

# Long table with one row per (language, measure), where the measure is bytes
# or files for either dataset. Languages lacking an execution mode or a type
# label are dropped. `variable` is reduced to "bytes" / "files" and the
# dataset is kept in is_high_profile.
plt_data_lang_features <- lang_features %>% select(exec, type, bytes_high_profile,
                                                   bytes_main, files_high_profile, files_main) %>%
  filter(!is.na(exec) & !is.na(type)) %>%
  melt(id.vars = c("exec", "type")) %>%
  mutate(is_high_profile = grepl("high_profile", variable)) %>%
  mutate(variable = gsub("_main", "", gsub("_high_profile", "", variable)))

# Facet label ("Bytes - main repos", ...) and a one-letter colour key per row.
# Computed on whole columns: the earlier 1:nrow() row loop misbehaved on an
# empty table and assigned one cell at a time.
plt_data_lang_features$var <- paste(
  capitalize(plt_data_lang_features$variable),
  ifelse(plt_data_lang_features$is_high_profile, "- high profile repos", "- main repos")
)
plt_data_lang_features$color <- ifelse(plt_data_lang_features$is_high_profile, "h", "m")

# Normalize each value by the total of its facet so point sizes are
# comparable across facets
plt_data_lang_features <- plt_data_lang_features %>%
  select(exec, type, var, value, color) %>%
  group_by(var) %>%
  mutate(sum_var = sum(value)) %>%
  ungroup() %>%
  mutate(val_normalized = value / sum_var)

# Grid of execution mode (x) by type system (y), one facet per measure and
# dataset (free scales, 2 columns); point size is the facet-normalized value
# and colour is the one-letter dataset key. The legend is hidden.
ggplot(plt_data_lang_features) +
  geom_point(aes(x = exec, 
                 y = type,
                 size = val_normalized,
                 col = color)) +
  # color_high_prof / color_main are defined outside this section
  scale_color_manual(values=c(color_high_prof, color_main)) +
  theme_bw() +
  theme(axis.text = element_text(size = 14),
        strip.text = element_text(size = 14),
        axis.title = element_text(size = 16),
        axis.text.x = element_text(angle = 45, hjust = 1),
        legend.position = "none") +
  facet_wrap(~var, scales = "free", ncol = 2) +
  xlab("Execution mode") +
  ylab("Type system")

# Save the figure as PDF and PNG (no plot argument is passed to ggsave)
ggsave("bytes_by_type_system.pdf")
## Saving 10 x 8 in image
ggsave("bytes_by_type_system.png")
## Saving 10 x 8 in image

Session info

# Print the R version, platform and attached/loaded package versions
sessionInfo()
## R version 3.4.3 (2017-11-30)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS High Sierra 10.13.4
## 
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] bindrcpp_0.2       jsonlite_1.5       scales_0.5.0      
##  [4] RColorBrewer_1.1-2 RISmed_2.1.7       purrr_0.2.4       
##  [7] Hmisc_4.1-1        Formula_1.2-2      survival_2.41-3   
## [10] lattice_0.20-35    bigrquery_0.4.1    lubridate_1.7.1   
## [13] ggplot2_2.2.1      dplyr_0.7.4        reshape2_1.4.3    
## 
## loaded via a namespace (and not attached):
##  [1] splines_3.4.3       colorspace_1.3-2    htmltools_0.3.6    
##  [4] yaml_2.1.16         base64enc_0.1-3     rlang_0.1.6        
##  [7] pillar_1.0.1        foreign_0.8-69      glue_1.2.0         
## [10] DBI_0.7             bindr_0.1           plyr_1.8.4         
## [13] stringr_1.2.0       munsell_0.4.3       gtable_0.2.0       
## [16] htmlwidgets_0.9     evaluate_0.10.1     labeling_0.3       
## [19] latticeExtra_0.6-28 knitr_1.18          curl_3.1           
## [22] htmlTable_1.11.2    Rcpp_0.12.14        acepack_1.4.1      
## [25] openssl_0.9.9       backports_1.1.2     checkmate_1.8.5    
## [28] gridExtra_2.3       digest_0.6.13       stringi_1.1.6      
## [31] grid_3.4.3          rprojroot_1.3-2     tools_3.4.3        
## [34] magrittr_1.5        lazyeval_0.2.1      tibble_1.4.1       
## [37] cluster_2.0.6       pkgconfig_2.0.1     Matrix_1.2-12      
## [40] data.table_1.10.4-3 assertthat_0.2.0    rmarkdown_1.8      
## [43] httr_1.3.1          rstudioapi_0.7      R6_2.2.2           
## [46] rpart_4.1-11        nnet_7.3-12         compiler_3.4.3